---
title: "Descriptive analysis (imputed)"
author: "Jae Yeon Kim"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
---

# Load packages 

```{r}
if (!require(pacman)) install.packages("pacman")

pacman::p_load(tidyverse, # tidyverse 
               plm, # panel data analysis
               pglm, # generalized panel data analysis
               clubSandwich, # robust SEs
               here, # computational reproducibility 
               glue, # gluing objects and strings 
               tidylog, # logging analysis
               naniar, # missing data 
               zeallot, # multiple assignments 
               readxl, 
               ggpubr,
               hrbrthemes, 
               wesanderson,
               broom, 
               sjPlot, 
               jtools, 
               patchwork, 
               broom.mixed, 
               estimatr, 
               stargazer,
               DeclareDesign,
               sensemakr,
               mice,
               usmap,
               janitor,
               modelsummary, 
               flextable,
               officer,
               extrafont,
               gt,
               bootstrap,
               lme4,
               clipr,
               infer)

source(here("functions/utils.r"))
source(here("functions/theme.R"))

ggplot2::theme_set(theme_bw())

# no scientific notation
options(scipen = 999)
```

# Load files 

```{r message = FALSE}
df <- read_csv(here("processed_data", "panel_data_recoded.csv"))[,-c(1:2)]
```

# Measurement check 

```{r}
df2 <- subset(df, wave == 2)

t.test(subset(df, wave == 2)$apa.discrim.rona, subset(df, wave == 3)$apa.discrim.rona) 

ch_score <- data.frame(ch_score = subset(df, wave == 2)$apa.discrim.rona - subset(df, wave == 3)$apa.discrim.rona)

ch_score %>%
  ggplot(aes(x = ch_score)) +
  geom_bar() +
  scale_x_binned() +
  labs(x = "Change score between waves 2 and 3",
       y = "Count")

dv <- subset(df, wave == 1)$gendiscrim

covat2 <- df %>%
  filter(wave == 2) %>%
  select(apa.discrim.rona, usborn, edu, DEM, GOP, age, male, chinese, indian)

covat3 <- df %>%
  filter(wave == 3) %>%
  select(apa.discrim.rona, usborn, edu, DEM, GOP, age, male, chinese, indian)

covat2$gendiscrim <- dv
covat3$gendiscrim <- dv
```

```{r}
# factorize dummy variables
covat2 <- factorize_dummy(covat2)
covat3 <- factorize_dummy(covat3)

mods_exo <- list(

  "2nd wave" = lm(apa.discrim.rona ~ gendiscrim + usborn + edu + DEM + GOP + age + male + chinese + indian, data = covat2),
  
  "3rd wave" = lm(apa.discrim.rona ~ gendiscrim + usborn + edu + DEM + GOP + age + male + chinese + indian, data = covat3)

  )
```

```{r}
modelsummary(mods_exo, output = "gt")
```

### Table 2

```{r}
modelsummary(mods_exo,
             fmt = 3,
             coef_omit = "Intercept",
             statistic = c("p = {p.value}"),
             #stars = TRUE,
             output = here("outputs", "table2.docx"),
             coef_rename = 
                c( "gendiscrim" = "W1 Gen discrim",
                   "apa.discrim.rona" = "COVID-19 discrim",
                   "usborn1" = "US born",
                   "edu" = "Education",
                   "age" = "Age",
                   "male1" = "Male",
                   "chinese1" = "Chinese",
                   "indian1" = "Indian",
                   "DEM1" = "Democratic Party",
                   "GOP1" = "Republican Party"),
             vcov = "robust")
```

# Summarize by group 

## Partisanship

```{r}
dodge <- position_dodge(width=0.9)

party1 <- df %>%
    group_by(wave, party.id) %>%
    count() %>%
    ggplot(aes(x = wave, y = n, fill = party.id)) +
    geom_col(position = dodge)

party2 <- df %>%
    mutate(party.id = if_else(party.id %in% c("Democrat", "Republican") | is.na(party.id), party.id, "Independent or Other")) %>%
    group_by(wave, party.id) %>%
    count() %>%
    ggplot(aes(x = wave, y = n, fill = party.id)) +
    geom_col(position = dodge)

party1 / party2 + plot_annotation(tag_levels = )
```

```{r}
var.list <- names(df)[str_detect(names(df), "disc")][c(1, 3)]

# rescale the discrimination perception variables
df$apa.discrim.rona <- df$apa.discrim.rona - .5
df$gendiscrim <- df$gendiscrim - .5

var.list <- c(var.list, c("lostjob", "incomereduc", "knowrona", "linkedfate", "idimport", "asnpride", "rona.behav.sneeze", "rona.behav.language", "rona.behav.walk", "rona.behav.transit", "rona.behav.other", "rona.behav.nochange", "2020likelyvote", "biden", "DEM", "DEM.strong", "GOP", "GOP.strong", "ronaunfair", "ronaunfairasian", "rona.apa.mistreat", "apa.responsible", "apa.have.rona", "apa.harassment", "whiteaffect", "blackaffect", "latinoaffect", "asianaffect"))

group_sum <- purrr::map_dfr(seq(var.list), group_mean) %>%
    mutate(variable = rep(var.list, each = 3)) %>%
    filter(!is.na(mean)) 
```

## Discrimination perception 

### Figure 1 

```{r}
group_sum %>%
    filter(str_detect(variable, "disc")) %>%
    mutate(variable = recode(variable, 
                             "apa.discrim.rona" = "COVID-19 discrimination",
                             "gendiscrim" = 
                                 "General discrimination")) %>%
    ggplot(aes(x = glue("{factor(wave_fac)} 2020"), y = mean,
               ymax = mean + ci_high, 
               ymin = mean - ci_low)) +
        geom_errorbar() +
        geom_col(alpha = 0.5) +
        #ggrepel::geom_text_repel(aes(label = glue("{round(mean, 3)*100}%", position = dodge), width = 0.25, col = "red")) +
        facet_wrap(~variable) +
        labs(title = "Changes in discrimination perception",
            x = "", y = "Average response") +
        geom_hline(yintercept = 0,  linetype = 'dotted', col = 'red', size = 1) +
        theme(legend.position = "none")

ggsave(here("outputs", "figure1.png"))
```

### Figure B2

```{r}
gen_density <- df %>%
  ggplot(aes(gendiscrim)) +
    geom_density() +
    facet_wrap(~wave_fac) +
    labs(x = "Response",
         y = "Density",
         title = "General discrimination") +
    geom_vline(xintercept = 0,  linetype = 'dotted', col = 'red', size = 1)

rona_density <- df %>%
  ggplot(aes(apa.discrim.rona)) +
    geom_density() +
    facet_wrap(~wave_fac) +
    labs(x = "Response",
         y = "Density",
         title = "COVID-19 discrimination") +
    geom_vline(xintercept = 0,  linetype = 'dotted', col = 'red', size = 1)

(gen_density / rona_density) + plot_annotation(tag_levels = "A")

ggsave(here("outputs", "figure_b2.png"))
```

```{r}
df$proxy <- rep(subset(df, wave == 1)$gendiscrim, 3)
```

```{r}
plot_w12 <- df %>%
    filter(wave %in% c(1,2)) %>%
    group_by(proxy) %>%
    summarise(mean = mean(apa.discrim.rona, na.rm = TRUE),
                  ci_high = ci.high(apa.discrim.rona),
                  ci_low = ci.low(apa.discrim.rona)) %>%
    mutate(group = 2)

plot_w13 <- df %>%
    filter(wave %in% c(1,3)) %>%
    group_by(proxy) %>%
    summarise(mean = mean(apa.discrim.rona, na.rm = TRUE),
                  ci_high = ci.high(apa.discrim.rona),
                  ci_low = ci.low(apa.discrim.rona)) %>%
    mutate(group = 3)

plot_w123 <- bind_rows(plot_w12, plot_w13)
```

### Figure 2

```{r}
plot_w123 %>%
    ggplot(aes(x = factor(proxy), y = mean,
               ymax = mean + ci_high, 
               ymin = mean - ci_low,
               col = factor(group))) +
        geom_pointrange() +
        #ggrepel::geom_text_repel(aes(label = round(mean, 2), y = mean), size = 5, vjust = -.5) +
        labs(x = "Response to general discrimination Q in W1", 
             y = "Average response to COVID-19 discrimination Q in W2 & W3",
             col = "Wave") +
        geom_hline(yintercept = 0,  linetype = 'dotted', col = 'red', size = 1) +
#        facet_wrap(~group) +
        #scale_fill_brewer(palette = "Set1") +
        #theme(legend.position =  "down") +
  ylim(c(-.5, .5)) 

ggsave(here("outputs", "figure2.png"), height = 6)
```

## Affect 

```{r}
group_sum %>%
  filter(wave_fac != "November") %>% 
  filter(str_detect(variable, "affect")) %>%
  mutate(variable = recode(variable, 
                           "asianaffect" = "Asian",
                           "latinoaffect" = "Latino",
                           "blackaffect" = "Black",
                           "whiteaffect" = "White")) %>%
  mutate(mean = round(mean, 2)) %>%
  ggplot(aes(x = wave_fac, y = mean,
               ymax = mean + ci_high, 
               ymin = mean - ci_low,
             col = variable)) +
        geom_pointrange() +
        labs(x = "", y = "Average response",
            col = "Affect") +
        geom_hline(yintercept = 3.5, linetype = 'dotted', col = 'red', size = 1) +
        scale_colour_brewer(palette = "Set1")
```

## COVID experience 

### Table C3 

```{r}
rona_hardship <- group_sum %>%
    filter(wave_fac %in% c("May", "November")) %>% 
    filter(str_detect(variable, "rona|fair|treat|job|income")) %>%
    filter(!str_detect(variable, "apa|nochange|2020|biden|DEM|GOP|affect")) %>%
    mutate(variable = recode(variable, 
                             "lostjob" = "Unemployment",
                             "incomereduc" = "Income reduction", 
                             "rona.behav.sneeze" = "Avoid coughing/sneezing",
                             "rona.behav.language" = 
                             "Avoid speaking Asian language",
                             "rona.behav.walk" = "Avoid walking in neighborhoods",
                             "rona.behav.transit" = "Avoid public transporation",
                             "rona.behav.other" = "Others",
                             "ronaunfairasian" = "Experienced COVID discrimination due to race",
                             "ronaunfair" = "Experienced COVID-19 discrimination",
                             "knowrona" = "Know others mistreated during COVID pandemic"),
                             ) %>%
  mutate(mean = mean*100) %>%
  flextable() %>%
    set_table_properties(layout = "autofit", width = .8) %>%
    save_as_docx(path = here("outputs", "table_c3.docx"))
```

```{r}
# Lost job 
lost_job_pct <- sum(!is.na(df %>%
  filter(wave == 2) %>% 
  pull(lostjob)))/(df %>% filter(wave == 2) %>% nrow()) %>%
  round(2)
  
round(lost_job_pct, 4) * 100 
```

5.57% participants in the wave 2 lost jobs. 

```{r}
income_reduction_pct <- sum(!is.na(df %>%
  filter(wave == 2) %>% 
  pull(incomereduc)))/(df %>% filter(wave == 2) %>% nrow()) %>%
  round(2)
round(income_reduction_pct, 4) * 100
```

### Table C4 

```{r}
df2 <- df2 %>%
    mutate(apa.discrim.rona = replace_na(apa.discrim.rona, 0),
           lostjob = replace_na(lostjob, 0),
           incomereduc = replace_na(incomereduc, 0),
           ronaunfairasian = replace_na(ronaunfairasian, 0),
           rona.apa.mistreat = replace_na(rona.apa.mistreat, 0))
```

```{r}
mod1 <- lm(apa.discrim.rona ~ lostjob + incomereduc, data = df2) 
mod2 <- lm(apa.discrim.rona ~ ronaunfairasian, data = df2)
mod3 <- lm(apa.discrim.rona ~ rona.apa.mistreat, data = df2)

set.seed(1234)
modelsummary(list(`Hardship` = mod1, `Experience` = mod2, `Witnesing` = mod3),
             fmt = 4,
             coef_omit = "Intercept",
             statistic = c("p = {p.value}"),
             #stars = TRUE,
             vcov = "robust",
             output = here("outputs", "table_c4.docx")) 
```

## Party ID

```{r}
dodge <- position_dodge(width=0.9)

group_sum %>%
    filter(variable %in% c("DEM", "GOP")) %>%
    ggplot(aes(x = glue("{factor(wave_fac)} 2020"), y = mean,
               ymax = mean + ci_high, 
               ymin = mean - ci_low)) +
        geom_errorbar() +
        geom_col(alpha = 0.5) +
        #ggrepel::geom_text_repel(aes(label = glue("{round(mean, 3)*100}%", position = dodge), width = 0.25, col = "red")) +
        facet_wrap(~variable) +
        labs(title = "Changes in discrimination perception",
            x = "", y = "Average response") +
        geom_hline(yintercept = 0,  linetype = 'dotted', col = 'red', size = 1) +
        theme(legend.position = "none")
```

# Regression analysis

## Imputation

```{r}
df_na_sum <- df %>%
  filter(wave != 1) %>%
  select(`2020likelyvote`, biden, gendiscrim, apa.discrim.rona, usborn, edu, income, DEM, GOP, age, male, wave, korean, chinese) %>%
  map_dfr(~is.na(.) %>% mean())
```

```{r}
imp <- mice(
  df %>%
    filter(wave != 1) %>%
    select(gendiscrim, apa.discrim.rona, usborn, edu, income, DEM, GOP, age, male, wave, korean, chinese),
    seed = 1234, # for reproducibility
    m = 5, # the number of imputations
    maxit = 10, # the max number of iterations 
    method = "pmm", # predictive mean method
    print = FALSE)

densityplot(imp, layout = c(1,2))

imputed <- mice::complete(imp)
```

```{r}
# Replace with the imputed values 

index <- 665:nrow(df)

df$apa.discrim.rona[index] <- imputed$apa.discrim.rona
df$gendiscrim[index] <- imputed$gendiscrim
df$DEM[index] <- imputed$DEM
df$GOP[index] <- imputed$GOP
df$age[index] <- imputed$age
```

```{r}
df %>%
  filter(wave != 1) %>%
  select(gendiscrim, apa.discrim.rona, usborn, edu, DEM, GOP, age, male, natorigin, biden, `2020likelyvote`) %>%
  vis_miss()
```

## Likely to vote 

```{r}
# rename the DV
df$likely_vote <- df$`2020likelyvote`

# rescale the discrimination perception variables
df$apa.discrim.rona <- df$apa.discrim.rona + .5
df$gendiscrim <- df$gendiscrim + .5

# create a proxy variable
df$proxy <- rep(subset(df, wave == 1)$gendiscrim, 3)

## check the proxy data distribution
df$proxy %>% table()

df$prior[df$proxy == 0.5] <- "Middle"
df$prior[df$proxy < 0.5] <- "Low"
df$prior[df$proxy > 0.5] <- "High"

min(df$proxy) ; max(df$proxy)
```

Group definition 

    - Middle = 0.5  
    - 1 >= High > 0.5 
    - 0 <= Low < 0.5 

```{r}
# create factor variables 

## covariates 
df$usborn <- factor(df$usborn)
df$male <- factor(df$male)
df$chinese <- factor(df$chinese)
df$indian <- factor(df$indian)
df$GOP <- factor(df$GOP)
df$DEM <- factor(df$DEM)
df$wave <- factor(df$wave)
df$nat_origin <- factor(df$natorigin)
```

```{r}
model.outs <- cal_model_outputs(df)
model.outs.low <- cal_model_outputs(df %>% filter(prior == "Low"))
model.outs.middle <- cal_model_outputs(df %>% filter(prior == "Middle"))
model.outs.high <- cal_model_outputs(df %>% filter(prior == "High"))
```

### Table 4 

```{r}
cm <- c("apa.discrim.rona" = "COIVD Discrim",
        "gendiscrim" = "General Discrim (Post COVID",
        "usborn1" = "US Born",
        "edu" = "Education",
        "DEM1" = "Democratic Party",
        "GOP1" = "Republican Party",
        "age" = "Age",
        "male1" = "Male",
        "wave3" = "Wave3",
        "proxy" = "Wave 1 General Discrim",
        "apa.discrim.rona:proxy" = "COVID Discrim* Pre-COVID General Discrim",
        "chinese1" = "Chinese",
        "indian1" = "Indian")
```

```{r}
turnout_mod <- list(
  
    "OLS" = lm(likely_vote ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df),
    
    "Weighted" = lm(likely_vote ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df, weight = weights),
    
    "Within" = plm(likely_vote ~ gendiscrim + apa.discrim.rona + DEM + GOP + age + nat_origin, data = df, index = c("pid", "wave"), model = "within"),
    
    "Interaction term" = lm(likely_vote ~ apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + proxy + apa.discrim.rona:proxy + nat_origin, data = df)
    
    )
```

```{r}
modelsummary(turnout_mod,
             fmt = 3,
             coef_map = cm,
             coef_omit = "Intercept|nat_origin",
             statistic = c("p = {p.value}"),
             output = here("outputs", "table4.docx"))
```

### Table C5 

```{r}
sense.out <- sensemakr(model = lm(likely_vote ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df),
          treatment = "apa.discrim.rona", 
          benchmark_covariates = "usborn1",
          kd = 1:3, 
          ky = 1:3, 
          q = 1)

summary(sense.out) %>%
    flextable() %>%
    set_table_properties(layout = "autofit", width = .8) %>%
    save_as_docx(path = here("outputs", "table_c5.docx"))
```

## Candidate preference

### Table 5 

```{r}
biden_mod <- list(
    
    "Logit" = glm(biden ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df, family = binomial),
    
    "Weighted" = glm(biden ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df, family = binomial, weight = weights),
    
    "Within" = plm(biden ~ gendiscrim + apa.discrim.rona + DEM + GOP + age + nat_origin + wave, data = df, index = c("pid", "wave"), model = "within", family = binomial),    
    
    "Interaction" = glm(biden ~  apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + proxy + apa.discrim.rona:proxy + nat_origin, data = df, family = binomial)

  )
```

```{r}
modelsummary(biden_mod,
             fmt = 3,
             coef_map = cm,
             coef_omit = "Intercept|nat_origin",
             statistic = c("p = {p.value}"),
             output = here("outputs", "table5.docx"))
```

### Table C6

```{r}
sense.out.biden <- sensemakr(model = lm(biden ~ gendiscrim + apa.discrim.rona + gendiscrim + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df),
          treatment = "apa.discrim.rona", 
          benchmark_covariates = "DEM1",
          kd = 1:3, 
          ky = 1:3, 
          q = 1)

summary(sense.out.biden) %>%
    flextable() %>%
    set_table_properties(layout = "autofit", width = .8) %>%
    save_as_docx(path = here("outputs", "table_c6.docx"))
```


### Table 6

```{r}
sub.models <- list(

    "High Discrimination Pre-COVID" = glm(biden ~ gendiscrim + apa.discrim.rona + usborn + edu + DEM + GOP + age + male + wave + chinese + indian, data = subset(df, prior == "High"), family = binomial, weight = weights),
            
    "Middle Discrimination Pre-COVID" = glm(biden ~ gendiscrim + apa.discrim.rona + usborn + edu + DEM + GOP + age + male + wave + chinese + indian, data = subset(df, prior == "Middle"), family = binomial, weight = weights),
            
    "Low Discrimination Pre-COVID" = glm(biden ~ gendiscrim + apa.discrim.rona + usborn + edu + DEM + GOP + age + male + wave + chinese + indian, data = subset(df, prior == "Low"), family = binomial, weight = weights)
    )
```

```{r}
modelsummary(sub.models,
             exponentiate = TRUE,
             fmt = 3,
             coef_map = cm,
             coef_omit = "Intercept",
             statistic = c("p = {p.value}"),
             #stars = TRUE,
             output = here("outputs", "table6.docx"),
             vcov = "robust")
```

### Figure B4 

```{r}
fit <- glm(biden ~ gendiscrim  + apa.discrim.rona + proxy + apa.discrim.rona:proxy + usborn + edu + DEM + GOP + age + male + wave + nat_origin, data = df, family = binomial, weight = weights)
```

```{r}
plot_model(fit, 
           type = "pred", 
           terms = c("apa.discrim.rona", "proxy"),
           color = viridis::plasma(n = 5)) +
  labs(title = glue("Predicted probabilities of support for Democratic candidate Biden"),
       col = "Pre-COVID general discriminatin perception",
       x = "COVID discrimination perception", 
       y = "Predicted probabilities") +
  theme(legend.position = "bottom")

ggsave(here("outputs", "figure_b4.png"), width = 7, height = 7)
```

## Correlations

### Figure B1 

```{r}
modelplot(lm(apa.discrim.rona ~ usborn + edu + DEM + GOP + age + male + wave_fac, data = subset(df, wave != 1)),
          coef_omit = "Intercept",
          vcov = "robust",
          coef_rename = 
                 c("usborn1" = "US born",
                   "edu" = "Education",
                   #"income" = "Income",
                   "DEM1" = "Democratic Party",
                   "GOP1" = "Republican Party",
                   "age" = "Age",
                   "male1" = "Male",
                   "wave_facNovember" = "Wave 3")) +
    geom_vline(xintercept = 0, col = "red", linetype = "dashed") +
    theme_classic()

ggsave(here("outputs", "figure_b1.png"))
```

### Figure B3 

```{r}
modelplot(lm(gendiscrim ~ usborn + edu + income + DEM + GOP + age + male, data = subset(df, wave == 1)),
          coef_omit = "Intercept",
          vcov = "robust",
          coef_rename = 
                 c("usborn1" = "US born",
                   "edu" = "Education",
                   "income" = "Income",
                   "DEM1" = "Democratic Party",
                   "GOP1" = "Republican Party",
                   "age" = "Age",
                   "male1" = "Male")) +
    geom_vline(xintercept = 0, col = "red", linetype = "dashed") +
    theme_classic()

ggsave(here("outputs", "figure_b3.png"))
```